library(EnsDb.Hsapiens.v86)
## Loading required package: ensembldb
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
##     table, tapply, union, unique, unsplit, which.max, which.min
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: S4Vectors
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: GenomeInfoDb
## Loading required package: GenomicFeatures
## Loading required package: AnnotationDbi
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: AnnotationFilter
## 
## Attaching package: 'ensembldb'
## The following object is masked from 'package:stats':
## 
##     filter
library(Signac)
library(Seurat)
## Attaching SeuratObject
library(GenomeInfoDb)
library(ggplot2)
library(patchwork)
library(Matrix)
## 
## Attaching package: 'Matrix'
## The following object is masked from 'package:S4Vectors':
## 
##     expand
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:ensembldb':
## 
##     filter, select
## The following object is masked from 'package:AnnotationDbi':
## 
##     select
## The following object is masked from 'package:Biobase':
## 
##     combine
## The following objects are masked from 'package:GenomicRanges':
## 
##     intersect, setdiff, union
## The following object is masked from 'package:GenomeInfoDb':
## 
##     intersect
## The following objects are masked from 'package:IRanges':
## 
##     collapse, desc, intersect, setdiff, slice, union
## The following objects are masked from 'package:S4Vectors':
## 
##     first, intersect, rename, setdiff, setequal, union
## The following objects are masked from 'package:BiocGenerics':
## 
##     combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(hdf5r)
## 
## Attaching package: 'hdf5r'
## The following object is masked from 'package:GenomicRanges':
## 
##     values
## The following object is masked from 'package:S4Vectors':
## 
##     values
library(future)
## 
## Attaching package: 'future'
## The following object is masked from 'package:AnnotationFilter':
## 
##     value
library(ggplot2)
library(pheatmap)
options(future.globals.maxSize = 100000 * 1024^2)

# Create Seurat objects with RNA data only from the Multi-ome data (Sole-Boldo et al., 2022)
matrix_dir = "/scratch/groups/khavari/users/andrewji/sb_multiome/Sole-Boldo_2022_Multiome/"
s_dir = c("S1/GSM6284671_S1_filtered_feature_bc_matrix.h5","S2/GSM6284672_S2_filtered_feature_bc_matrix.h5")

all_rna_data = list()
for (i in 1:length(s_dir))
{
  mat = Read10X_h5(paste(matrix_dir,s_dir[i],sep=""))
  mat_rna = mat[[1]]
  all_rna_data[[i]] = mat_rna
}
## Genome matrix has multiple modalities, returning a list of matrices for this genome
## Genome matrix has multiple modalities, returning a list of matrices for this genome
all_obj = list()
allnames = c("S1","S2")
for (i in 1:length(all_rna_data))
{
  obj <- CreateSeuratObject(counts = all_rna_data[[i]], project = allnames[i])
  obj[["percent.mt"]] <- PercentageFeatureSet(obj, pattern = "^MT-")
  obj = subset(obj, nFeature_RNA >= 200 & percent.mt < 10) 
  all_obj[[i]] = obj
}

# Normalize and identify variable features for each dataset independently
all_obj <- lapply(X = all_obj, FUN = function(x) {
    x <- NormalizeData(x)
    x <- FindVariableFeatures(x, selection.method = "vst", nfeatures = 2000)
})

# Select features that are repeatedly variable across datasets for integration
features <- SelectIntegrationFeatures(object.list = all_obj)
s_anchors <- FindIntegrationAnchors(object.list = all_obj, anchor.features = features)
## Warning in CheckDuplicateCellNames(object.list = object.list): Some cell names
## are duplicated across objects provided. Renaming to enforce unique cell names.
## Scaling features for provided objects
## Finding all pairwise anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
##  Found 8691 anchors
## Filtering anchors
##  Retained 4083 anchors
# Integrate both samples
s_comb <- IntegrateData(anchorset = s_anchors)
## Merging dataset 2 into 1
## Extracting anchors for merged samples
## Finding integration vectors
## Finding integration vector weights
## Integrating data
# Perform dimensionality reduction on integrated object
DefaultAssay(s_comb) <- "integrated"

# Run the standard workflow for visualization and clustering
s_comb <- ScaleData(s_comb, verbose = FALSE)
s_comb <- RunPCA(s_comb, npcs = 30, verbose = FALSE)
s_comb <- RunUMAP(s_comb, reduction = "pca", dims = 1:30)
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 08:53:26 UMAP embedding parameters a = 0.9922 b = 1.112
## 08:53:26 Read 6661 rows and found 30 numeric columns
## 08:53:26 Using Annoy for neighbor search, n_neighbors = 30
## 08:53:26 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 08:53:27 Writing NN index file to temp file /tmp/RtmpwYMQx5/file37542f22948e
## 08:53:27 Searching Annoy index using 1 thread, search_k = 3000
## 08:53:29 Annoy recall = 100%
## 08:53:31 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 08:53:34 Initializing from normalized Laplacian + noise (using irlba)
## 08:53:34 Commencing optimization for 500 epochs, with 282304 positive edges
## 08:53:52 Optimization finished
s_comb <- FindNeighbors(s_comb, reduction = "pca", dims = 1:30)
## Computing nearest neighbor graph
## Computing SNN
s_comb <- FindClusters(s_comb, resolution = 0.1)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 6661
## Number of edges: 258901
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9453
## Number of communities: 7
## Elapsed time: 0 seconds
s_comb <- FindClusters(s_comb, resolution = 0.2)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 6661
## Number of edges: 258901
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9170
## Number of communities: 9
## Elapsed time: 0 seconds
s_comb <- FindClusters(s_comb, resolution = 0.4)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 6661
## Number of edges: 258901
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8794
## Number of communities: 11
## Elapsed time: 0 seconds
s_comb <- FindClusters(s_comb, resolution = 0.5)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 6661
## Number of edges: 258901
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8620
## Number of communities: 12
## Elapsed time: 0 seconds
s_comb <- FindClusters(s_comb, resolution = 0.6)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 6661
## Number of edges: 258901
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8484
## Number of communities: 12
## Elapsed time: 0 seconds
s_comb <- FindClusters(s_comb, resolution = 0.8)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 6661
## Number of edges: 258901
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8226
## Number of communities: 15
## Elapsed time: 0 seconds
# UMAP labeled by sample
DimPlot(s_comb, reduction = "umap", group.by = "orig.ident")

# UMAP labeled by clusters at res 0.8
DimPlot(s_comb, reduction = "umap", label = TRUE, repel = TRUE)

# Check expression of markers
marker_genes = c("KRT5","KRT14","COL17A1","KRT1","KRT10","MKI67")
marker_genes2 = c("IVL","KLK7","SOX9","HLA-DRA","MLANA","PTPRC")
FeaturePlot(object = s_comb, features = marker_genes, cols = c("lightgrey", "blue"), pt.size = 0.25, order=T, ncol = 3)
## Warning: Could not find KRT1 in the default search locations, found in RNA assay
## instead
## Warning: Could not find KRT10 in the default search locations, found in RNA
## assay instead

FeaturePlot(object = s_comb, features = marker_genes2, cols = c("lightgrey", "blue"), pt.size = 0.25, order=T, ncol = 3)

# Subset IFE cells and re-integrate
Idents(s_comb) = "integrated_snn_res.0.8"
s_comb_filt = subset(s_comb, idents = c(0:6,8:10,14))


# Add ATAC data to object
# Read in counts for  ATAC
all_atac_data = list()
for (i in 1:length(s_dir))
{
  mat = Read10X_h5(paste(matrix_dir,s_dir[i],sep=""))
  mat_atac = mat[[2]]
  all_atac_data[[i]] = mat_atac
}
## Genome matrix has multiple modalities, returning a list of matrices for this genome
## Genome matrix has multiple modalities, returning a list of matrices for this genome
s1_atac_counts = all_atac_data[[1]]
s1_grange.counts <- StringToGRanges(rownames(s1_atac_counts), sep = c(":", "-"))
s1_grange.use <- seqnames(s1_grange.counts) %in% standardChromosomes(s1_grange.counts)
s1_atac_counts <- s1_atac_counts[as.vector(s1_grange.use), ]
s1_annotations <- GetGRangesFromEnsDb(ensdb = EnsDb.Hsapiens.v86)
## Warning in .merge_two_Seqinfo_objects(x, y): The 2 combined objects have no sequence levels in common. (Use
##   suppressWarnings() to suppress this warning.)
## Warning in .merge_two_Seqinfo_objects(x, y): The 2 combined objects have no sequence levels in common. (Use
##   suppressWarnings() to suppress this warning.)

## Warning in .merge_two_Seqinfo_objects(x, y): The 2 combined objects have no sequence levels in common. (Use
##   suppressWarnings() to suppress this warning.)

## Warning in .merge_two_Seqinfo_objects(x, y): The 2 combined objects have no sequence levels in common. (Use
##   suppressWarnings() to suppress this warning.)

## Warning in .merge_two_Seqinfo_objects(x, y): The 2 combined objects have no sequence levels in common. (Use
##   suppressWarnings() to suppress this warning.)

## Warning in .merge_two_Seqinfo_objects(x, y): The 2 combined objects have no sequence levels in common. (Use
##   suppressWarnings() to suppress this warning.)

## Warning in .merge_two_Seqinfo_objects(x, y): The 2 combined objects have no sequence levels in common. (Use
##   suppressWarnings() to suppress this warning.)

## Warning in .merge_two_Seqinfo_objects(x, y): The 2 combined objects have no sequence levels in common. (Use
##   suppressWarnings() to suppress this warning.)

## Warning in .merge_two_Seqinfo_objects(x, y): The 2 combined objects have no sequence levels in common. (Use
##   suppressWarnings() to suppress this warning.)

## Warning in .merge_two_Seqinfo_objects(x, y): The 2 combined objects have no sequence levels in common. (Use
##   suppressWarnings() to suppress this warning.)

## Warning in .merge_two_Seqinfo_objects(x, y): The 2 combined objects have no sequence levels in common. (Use
##   suppressWarnings() to suppress this warning.)

## Warning in .merge_two_Seqinfo_objects(x, y): The 2 combined objects have no sequence levels in common. (Use
##   suppressWarnings() to suppress this warning.)

## Warning in .merge_two_Seqinfo_objects(x, y): The 2 combined objects have no sequence levels in common. (Use
##   suppressWarnings() to suppress this warning.)

## Warning in .merge_two_Seqinfo_objects(x, y): The 2 combined objects have no sequence levels in common. (Use
##   suppressWarnings() to suppress this warning.)

## Warning in .merge_two_Seqinfo_objects(x, y): The 2 combined objects have no sequence levels in common. (Use
##   suppressWarnings() to suppress this warning.)

## Warning in .merge_two_Seqinfo_objects(x, y): The 2 combined objects have no sequence levels in common. (Use
##   suppressWarnings() to suppress this warning.)

## Warning in .merge_two_Seqinfo_objects(x, y): The 2 combined objects have no sequence levels in common. (Use
##   suppressWarnings() to suppress this warning.)

## Warning in .merge_two_Seqinfo_objects(x, y): The 2 combined objects have no sequence levels in common. (Use
##   suppressWarnings() to suppress this warning.)

## Warning in .merge_two_Seqinfo_objects(x, y): The 2 combined objects have no sequence levels in common. (Use
##   suppressWarnings() to suppress this warning.)

## Warning in .merge_two_Seqinfo_objects(x, y): The 2 combined objects have no sequence levels in common. (Use
##   suppressWarnings() to suppress this warning.)

## Warning in .merge_two_Seqinfo_objects(x, y): The 2 combined objects have no sequence levels in common. (Use
##   suppressWarnings() to suppress this warning.)

## Warning in .merge_two_Seqinfo_objects(x, y): The 2 combined objects have no sequence levels in common. (Use
##   suppressWarnings() to suppress this warning.)

## Warning in .merge_two_Seqinfo_objects(x, y): The 2 combined objects have no sequence levels in common. (Use
##   suppressWarnings() to suppress this warning.)

## Warning in .merge_two_Seqinfo_objects(x, y): The 2 combined objects have no sequence levels in common. (Use
##   suppressWarnings() to suppress this warning.)
seqlevelsStyle(s1_annotations) <- 'UCSC'
genome(s1_annotations) <- "hg38"

frag.files = c("S1/GSM6284669_S1_atac_fragments.tsv.gz","S2/GSM6284670_S2_atac_fragments.tsv.gz")
s1_frag.file <- paste(matrix_dir,frag.files[1],sep="")
s1_chrom_assay <- CreateChromatinAssay(
   counts = s1_atac_counts,
   sep = c(":", "-"),
   genome = 'hg38',
   fragments = s1_frag.file,
   min.cells = 10,
   annotation = s1_annotations
 )
## Computing hash
s1_atac <- CreateSeuratObject(
  counts = s1_chrom_assay,
  assay = "peaks",
  #meta.data = metadata
)
## Warning: Keys should be one or more alphanumeric characters followed by an
## underscore, setting key from peaks to peaks_
s2_atac_counts = all_atac_data[[2]]
s2_grange.counts <- StringToGRanges(rownames(s2_atac_counts), sep = c(":", "-"))
s2_grange.use <- seqnames(s2_grange.counts) %in% standardChromosomes(s2_grange.counts)
s2_atac_counts <- s2_atac_counts[as.vector(s2_grange.use), ]
s2_annotations <- GetGRangesFromEnsDb(ensdb = EnsDb.Hsapiens.v86)
## Warning in .merge_two_Seqinfo_objects(x, y): The 2 combined objects have no sequence levels in common. (Use
##   suppressWarnings() to suppress this warning.)

## Warning in .merge_two_Seqinfo_objects(x, y): The 2 combined objects have no sequence levels in common. (Use
##   suppressWarnings() to suppress this warning.)

## Warning in .merge_two_Seqinfo_objects(x, y): The 2 combined objects have no sequence levels in common. (Use
##   suppressWarnings() to suppress this warning.)

## Warning in .merge_two_Seqinfo_objects(x, y): The 2 combined objects have no sequence levels in common. (Use
##   suppressWarnings() to suppress this warning.)

## Warning in .merge_two_Seqinfo_objects(x, y): The 2 combined objects have no sequence levels in common. (Use
##   suppressWarnings() to suppress this warning.)

## Warning in .merge_two_Seqinfo_objects(x, y): The 2 combined objects have no sequence levels in common. (Use
##   suppressWarnings() to suppress this warning.)

## Warning in .merge_two_Seqinfo_objects(x, y): The 2 combined objects have no sequence levels in common. (Use
##   suppressWarnings() to suppress this warning.)

## Warning in .merge_two_Seqinfo_objects(x, y): The 2 combined objects have no sequence levels in common. (Use
##   suppressWarnings() to suppress this warning.)

## Warning in .merge_two_Seqinfo_objects(x, y): The 2 combined objects have no sequence levels in common. (Use
##   suppressWarnings() to suppress this warning.)

## Warning in .merge_two_Seqinfo_objects(x, y): The 2 combined objects have no sequence levels in common. (Use
##   suppressWarnings() to suppress this warning.)

## Warning in .merge_two_Seqinfo_objects(x, y): The 2 combined objects have no sequence levels in common. (Use
##   suppressWarnings() to suppress this warning.)

## Warning in .merge_two_Seqinfo_objects(x, y): The 2 combined objects have no sequence levels in common. (Use
##   suppressWarnings() to suppress this warning.)

## Warning in .merge_two_Seqinfo_objects(x, y): The 2 combined objects have no sequence levels in common. (Use
##   suppressWarnings() to suppress this warning.)

## Warning in .merge_two_Seqinfo_objects(x, y): The 2 combined objects have no sequence levels in common. (Use
##   suppressWarnings() to suppress this warning.)

## Warning in .merge_two_Seqinfo_objects(x, y): The 2 combined objects have no sequence levels in common. (Use
##   suppressWarnings() to suppress this warning.)

## Warning in .merge_two_Seqinfo_objects(x, y): The 2 combined objects have no sequence levels in common. (Use
##   suppressWarnings() to suppress this warning.)

## Warning in .merge_two_Seqinfo_objects(x, y): The 2 combined objects have no sequence levels in common. (Use
##   suppressWarnings() to suppress this warning.)

## Warning in .merge_two_Seqinfo_objects(x, y): The 2 combined objects have no sequence levels in common. (Use
##   suppressWarnings() to suppress this warning.)

## Warning in .merge_two_Seqinfo_objects(x, y): The 2 combined objects have no sequence levels in common. (Use
##   suppressWarnings() to suppress this warning.)

## Warning in .merge_two_Seqinfo_objects(x, y): The 2 combined objects have no sequence levels in common. (Use
##   suppressWarnings() to suppress this warning.)

## Warning in .merge_two_Seqinfo_objects(x, y): The 2 combined objects have no sequence levels in common. (Use
##   suppressWarnings() to suppress this warning.)

## Warning in .merge_two_Seqinfo_objects(x, y): The 2 combined objects have no sequence levels in common. (Use
##   suppressWarnings() to suppress this warning.)

## Warning in .merge_two_Seqinfo_objects(x, y): The 2 combined objects have no sequence levels in common. (Use
##   suppressWarnings() to suppress this warning.)

## Warning in .merge_two_Seqinfo_objects(x, y): The 2 combined objects have no sequence levels in common. (Use
##   suppressWarnings() to suppress this warning.)
seqlevelsStyle(s2_annotations) <- 'UCSC'
genome(s2_annotations) <- "hg38"

s2_frag.file <- paste(matrix_dir,frag.files[2],sep="")
s2_chrom_assay <- CreateChromatinAssay(
   counts = s2_atac_counts,
   sep = c(":", "-"),
   genome = 'hg38',
   fragments = s2_frag.file,
   min.cells = 10,
   annotation = s2_annotations
 )
## Computing hash
s2_atac <- CreateSeuratObject(
  counts = s2_chrom_assay,
  assay = "peaks",
  #meta.data = metadata
)
## Warning: Keys should be one or more alphanumeric characters followed by an
## underscore, setting key from peaks to peaks_
DefaultAssay(s1_atac) = "peaks"
DefaultAssay(s2_atac) = "peaks"

# convert to genomic ranges
gr_s1 = granges(s1_atac)
gr_s2 = granges(s2_atac)

# Create a unified set of peaks to quantify in each dataset
combined.peaks <- reduce(x = c(gr_s1, gr_s2))

# Filter out bad peaks based on length
peakwidths <- width(combined.peaks)
combined.peaks <- combined.peaks[peakwidths  < 10000 & peakwidths > 20]
combined.peaks
## GRanges object with 80844 ranges and 0 metadata columns:
##           seqnames            ranges strand
##              <Rle>         <IRanges>  <Rle>
##       [1]     chr1     180790-181751      *
##       [2]     chr1     183867-184813      *
##       [3]     chr1     186562-187409      *
##       [4]     chr1     190968-191900      *
##       [5]     chr1     629432-630398      *
##       ...      ...               ...    ...
##   [80840]     chrY 56873534-56874398      *
##   [80841]     chrY 56877137-56878338      *
##   [80842]     chrY 56879505-56880375      *
##   [80843]     chrY 56883603-56884517      *
##   [80844]     chrY 56885148-56886181      *
##   -------
##   seqinfo: 24 sequences from an unspecified genome; no seqlengths
# Create new merged ATAC object
combined <- merge(
  x = s1_atac,
  y = s2_atac,
  add.cell.ids = c("S1","S2")
)
## 
## Binding matrix rows
## 
## Binding matrix rows
## 
## Binding matrix rows
## 
## Binding matrix rows
combined[["peaks"]]
## ChromatinAssay data with 80844 features for 6793 cells
## Variable features: 0 
## Genome: hg38 
## Annotation present: TRUE 
## Motifs present: FALSE 
## Fragment files: 2
# Match barcode names and filter combined for overlapping cells in s_comb_filt
s_comb_mo = s_comb_filt
col_split = strsplit(colnames(s_comb_mo),"_")
new_col = paste("S",unlist(lapply(col_split,"[[",2)),"_",unlist(lapply(col_split,"[[",1)),sep="")
s_comb_mo = RenameCells(s_comb_mo, new.names = new_col)
combined_filt = subset(combined, cells = colnames(s_comb_mo))

s_comb_mo[["ATAC"]] = combined_filt[["peaks"]]

# Dimensionality reduction with ATAC
DefaultAssay(s_comb_mo) <- "ATAC"
s_comb_mo <- RunTFIDF(s_comb_mo)
## Performing TF-IDF normalization
s_comb_mo <- FindTopFeatures(s_comb_mo, min.cutoff = 'q0')
s_comb_mo <- RunSVD(s_comb_mo)
## Running SVD
## Scaling cell embeddings
s_comb_mo <- RunUMAP(s_comb_mo, reduction = 'lsi', dims = 2:50, reduction.name = "umap.atac", reduction.key = "atacUMAP_")
## 09:00:54 UMAP embedding parameters a = 0.9922 b = 1.112
## 09:00:54 Read 6108 rows and found 49 numeric columns
## 09:00:54 Using Annoy for neighbor search, n_neighbors = 30
## 09:00:54 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 09:00:55 Writing NN index file to temp file /tmp/RtmpwYMQx5/file37544f079aac
## 09:00:55 Searching Annoy index using 1 thread, search_k = 3000
## 09:00:57 Annoy recall = 100%
## 09:00:59 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 09:01:03 Initializing from normalized Laplacian + noise (using irlba)
## 09:01:03 Commencing optimization for 500 epochs, with 244916 positive edges
## 09:01:20 Optimization finished
# Integrate ATAC objects
obj_list_atac = SplitObject(s_comb_mo,split.by = "orig.ident")

# find integration anchors
integration.anchors <- FindIntegrationAnchors(
  object.list = obj_list_atac,
  anchor.features = rownames(s_comb_mo),
  reduction = "rlsi",
  dims = 2:30
)
## Computing within dataset neighborhoods
## Finding all pairwise anchors
## Warning: No filtering performed if passing to data rather than counts
## Projecting new data onto SVD
## Projecting new data onto SVD
## Finding neighborhoods
## Finding anchors
##  Found 1942 anchors
# Create new object with integrated LSI embeddings
s_comb_atac = s_comb_mo
s_comb_atac <- IntegrateEmbeddings(
  anchorset = integration.anchors,
  reductions = s_comb_atac[["lsi"]],
  new.reduction.name = "integrated_lsi",
  dims.to.integrate = 1:30
)
## Merging dataset 1 into 2
## Extracting anchors for merged samples
## Finding integration vectors
## Finding integration vector weights
## Integrating data
# Integrate RNA objects
DefaultAssay(s_comb_mo) = "RNA"
obj_list = SplitObject(s_comb_mo,split.by = "orig.ident")

# normalize and identify variable features for each dataset independently
obj_list <- lapply(X = obj_list, FUN = function(x) {
    DefaultAssay(x) = "RNA"
    x <- NormalizeData(x)
    x <- FindVariableFeatures(x, selection.method = "vst", nfeatures = 2000)
})

features <- SelectIntegrationFeatures(object.list = obj_list)
s_anchors <- FindIntegrationAnchors(object.list = obj_list, anchor.features = features)
## Scaling features for provided objects
## Finding all pairwise anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
##  Found 8587 anchors
## Filtering anchors
##  Retained 3515 anchors
s_comb_mo <- IntegrateData(anchorset = s_anchors, dims = 1:30)
## Merging dataset 1 into 2
## Extracting anchors for merged samples
## Finding integration vectors
## Finding integration vector weights
## Integrating data
DefaultAssay(s_comb_mo) = "integrated"

s_comb_mo <- ScaleData(s_comb_mo, verbose = FALSE)
s_comb_mo <- RunPCA(s_comb_mo, npcs = 30, verbose = FALSE)
s_comb_mo <- RunUMAP(s_comb_mo, reduction = "pca", dims = 1:30, reduction.name = 'umap.rna', reduction.key = 'rnaUMAP_')
## 09:04:39 UMAP embedding parameters a = 0.9922 b = 1.112
## 09:04:39 Read 6108 rows and found 30 numeric columns
## 09:04:39 Using Annoy for neighbor search, n_neighbors = 30
## 09:04:39 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 09:04:40 Writing NN index file to temp file /tmp/RtmpwYMQx5/file3754263916ba
## 09:04:40 Searching Annoy index using 1 thread, search_k = 3000
## 09:04:42 Annoy recall = 100%
## 09:04:44 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 09:04:47 Initializing from normalized Laplacian + noise (using irlba)
## 09:04:47 Commencing optimization for 500 epochs, with 260074 positive edges
## 09:05:03 Optimization finished
# RNA UMAP by sample identity
p1 = DimPlot(s_comb_mo, reduction = "umap.rna", label = F, group.by = "orig.ident") + NoLegend() + NoAxes()
print(p1)

# save png
png("s_comb_mo_predicted_id_umap_rna_orig.ident.png",width = 5.5, height = 5, res = 300, units = "in")
print(p1)
dev.off()
## png 
##   2
# Add ATAC integration to original object 
s_comb_mo[["integrated_lsi"]] = s_comb_atac[["integrated_lsi"]]

# Run UMAP on integrated ATAC objects
s_comb_mo <- RunUMAP(s_comb_mo, reduction = 'integrated_lsi', dims = 2:50, reduction.name = "umap.atac", reduction.key = "atacUMAP_")
## 09:05:07 UMAP embedding parameters a = 0.9922 b = 1.112
## 09:05:07 Read 6108 rows and found 49 numeric columns
## 09:05:07 Using Annoy for neighbor search, n_neighbors = 30
## 09:05:07 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 09:05:08 Writing NN index file to temp file /tmp/RtmpwYMQx5/file37541898f89a
## 09:05:08 Searching Annoy index using 1 thread, search_k = 3000
## 09:05:10 Annoy recall = 100%
## 09:05:12 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 09:05:16 Initializing from normalized Laplacian + noise (using irlba)
## 09:05:16 Commencing optimization for 500 epochs, with 259218 positive edges
## 09:05:34 Optimization finished
setwd("/scratch/groups/khavari/users/andrewji/sb_multiome/s_comb_mo/render")
# ATAC UMAP labeled by sample identity
p1 = DimPlot(s_comb_mo, reduction = "umap.atac", label = F, group.by = "orig.ident") + NoLegend() + NoAxes()
print(p1)

# save png
png("s_comb_mo_predicted_id_umap_atac_origident.png",width = 5.5, height = 5, res = 300, units = "in")
print(p1)
dev.off()
## png 
##   2
# Predict AJ cohort IFE clusters 
DefaultAssay(s_comb_mo) = "integrated"

# Load AJ cohort IFE object
kc_int_filt = readRDS("/oak/stanford/groups/khavari/users/andrewji/normal_skin/kc_norm/kc_int/kc_int_filt/kc_int_filt.Rds")

Idents(kc_int_filt) = "integrated_snn_res.0.4"
kc.anchors <- FindTransferAnchors(reference = kc_int_filt, query = s_comb_mo, 
                                  dims = 1:30, k.filter=NA)
## Performing PCA on the provided reference using 622 features as input.
## Projecting cell embeddings
## Finding neighborhoods
## Finding anchors
##  Found 1956 anchors
kc_predictions <- TransferData(anchorset = kc.anchors, refdata = kc_int_filt$integrated_snn_res.0.4, 
                               dims = 1:30)
## Finding integration vectors
## Finding integration vector weights
## Predicting cell labels
s_comb_mo <- AddMetaData(s_comb_mo, metadata = kc_predictions)
Idents(s_comb_mo) = "predicted.id"
p = DimPlot(object = s_comb_mo, group.by = "predicted.id", label = TRUE)
print(p)

# Plot by predicted clusters with matched colors as AJ cohort IFE (0 cells with predicted cluster 11)
Idents(s_comb_mo) = "predicted.id"
my_levels = c(0:10)
match_cols = scales::hue_pal()(12)[1:11]
Idents(s_comb_mo) = factor(x = Idents(s_comb_mo), levels = my_levels)
# RNA UMAP and save png
p1 <- DimPlot(s_comb_mo, reduction = "umap.rna", label = TRUE, cols = match_cols, label.size = 8) + NoLegend() + NoAxes()
print(p1)

png("s_comb_mo_predicted_id_umap_rna_match_color.png",width = 5.5, height = 5, res = 300, units = "in")
print(p1)
dev.off()
## png 
##   2
# ATAC UMAP and save png
p1 <- DimPlot(s_comb_mo, reduction = "umap.atac", label = TRUE, cols = match_cols, label.size = 8) + NoLegend() + NoAxes()
print(p1)

png("s_comb_mo_predicted_id_umap_atac_match_color.png",width = 5.5, height = 5, res = 300, units = "in")
print(p1)
dev.off()
## png 
##   2
# Highlight cluster 2 predicted cells (Spinous II cells)
# RNA UMAP
p = DimPlot(object = s_comb_mo, reduction = "umap.rna", group.by = "predicted.id", label = F, cells.highlight = WhichCells(s_comb_mo,idents=2)) + NoAxes() + NoLegend()
print(p)

# save png
png("s_comb_mo_umap_rna_highlight_predicted_2.png",width = 5, height = 5, res = 300, units = "in")
print(p)
dev.off()
## png 
##   2
# ATAC UMAP
p = DimPlot(object = s_comb_mo, reduction = "umap.atac", group.by = "predicted.id", label = F, cells.highlight = WhichCells(s_comb_mo,idents=2)) + NoAxes() + NoLegend()
print(p)

# save png
png("s_comb_mo_umap_atac_highlight_predicted_2.png",width = 5, height = 5, res = 300, units = "in")
print(p)
dev.off()
## png 
##   2
# Feature plots for predicted cluster scores
predicted_id = paste("prediction.score.",0:11,sep="")
p = FeaturePlot(object = s_comb_mo, features = predicted_id[1:6], cols = c("lightgrey", "blue"), pt.size = 0.25, order=T, ncol = 3)
print(p)

p = FeaturePlot(object = s_comb_mo, features = predicted_id[7:12], cols = c("lightgrey", "blue"), pt.size = 0.25, order=T, ncol = 3)
## Warning in FeaturePlot(object = s_comb_mo, features = predicted_id[7:12], : All
## cells have the same value (0) of prediction.score.11.
print(p)

# save png
png("s_comb_mo_featureplot_predicted_id_scores.png",width = 10, height = 14, res = 300, units = "in")
FeaturePlot(object = s_comb_mo, features = predicted_id[1:6], cols = c("lightgrey", "blue"), pt.size = 0.25, order=T, ncol = 3)
dev.off()
## png 
##   2
# Order clusters from basal to granular
my_levels = c(3,4,6,5,8,7,2,0,1,9,10)
Idents(s_comb_mo) = factor(x = Idents(s_comb_mo), levels = my_levels)


# Coverage plots for major predicted basal, spinous, and granular clusters
p = CoveragePlot(s_comb_mo, region = "KRT5", extend.upstream = 2000, extend.downstream = 2000, features = "KRT5", assay = 'ATAC', expression.assay = 'RNA', peaks = T, idents = c(3,4,2,1,10)) 
p = p & scale_fill_manual(values = match_cols[c(4,5,3,2,11)])
## Scale for fill is already present.
## Adding another scale for fill, which will replace the existing scale.
## Scale for fill is already present.
## Adding another scale for fill, which will replace the existing scale.
print(p)

p = CoveragePlot(s_comb_mo, region = "KRT14", extend.upstream = 2000, extend.downstream = 2000, features = "KRT14", assay = 'ATAC', expression.assay = 'RNA', peaks = T, idents = c(3,4,2,1,10))
p = p & scale_fill_manual(values = match_cols[c(4,5,3,2,11)])
## Scale for fill is already present.
## Adding another scale for fill, which will replace the existing scale.
## Scale for fill is already present.
## Adding another scale for fill, which will replace the existing scale.
print(p)

p = CoveragePlot(s_comb_mo, region = "KRT1", extend.upstream = 2000, extend.downstream = 2000, features = "KRT1", assay = 'ATAC', expression.assay = 'RNA', peaks = T, idents = c(3,4,2,1,10))
p = p & scale_fill_manual(values = match_cols[c(4,5,3,2,11)])
## Scale for fill is already present.
## Adding another scale for fill, which will replace the existing scale.
## Scale for fill is already present.
## Adding another scale for fill, which will replace the existing scale.
print(p)

p = CoveragePlot(s_comb_mo, region = "KRT10", extend.upstream = 2000, extend.downstream = 2000, features = "KRT10", assay = 'ATAC', expression.assay = 'RNA', peaks = T, idents = c(3,4,2,1,10))
p = p & scale_fill_manual(values = match_cols[c(4,5,3,2,11)])
## Scale for fill is already present.
## Adding another scale for fill, which will replace the existing scale.
## Scale for fill is already present.
## Adding another scale for fill, which will replace the existing scale.
print(p)
## Warning: Removed 7 rows containing missing values (`geom_segment()`).

# Get differentially accessible peaks (code run separately to save time)
# Idents(s_comb_mo) = "predicted.id"
# da_peaks_all <- FindAllMarkers(
#   object = s_comb_mo,
#   test.use = 'LR',
#   latent.vars = 'atac_peak_region_fragments',
#   min.pct = 0.05
# )
# write.table(da_peaks_all,file="s_comb_mo_kc_int_filt_res0.4_predict_da_peaks.csv",sep=",",row.names=T,col.names=T)

# Print heatmap of differentially accessible peaks
s_comb_mo_da_peaks = read.table("/scratch/groups/khavari/users/andrewji/sb_multiome/s_comb_mo/s_comb_mo_kc_int_filt_res0.4_predict_da_peaks.csv",sep=",",row.names=1,header=T,stringsAsFactors=F)
s_comb_mo_da_peaks_sig = subset(s_comb_mo_da_peaks, p_val < 0.005)
s_comb_mo_exp = as.matrix(s_comb_mo[["ATAC"]]@data)
## Warning in asMethod(object): sparse->dense coercion: allocating vector of size
## 3.7 GiB
# Calculate average accessibility per predicted cluster (exclude clus 9 given low number of cells)
comb_avg_exp = c()
clus = c(0:8,10)
for (i in 1:length(clus)) {
  cells = WhichCells(s_comb_mo, idents = clus[i])
  exp_mat = s_comb_mo_exp[,cells]
  if (length(cells)>1) {avg = apply(exp_mat,1,mean)}
  else {avg = exp_mat}
  comb_avg_exp = cbind(comb_avg_exp,avg)
}
colnames(comb_avg_exp) = clus

col.pal <- rev(RColorBrewer::brewer.pal(11, "RdBu"))
da_peaks_sig = unique(s_comb_mo_da_peaks_sig$gene)
atac_heat_map = pheatmap(comb_avg_exp[da_peaks_sig,as.character(c(3,4,2,1,10))], show_rownames = F, cluster_cols = T, cluster_rows = T, main = "Peaks", scale = "row",color = col.pal)

setwd("/scratch/groups/khavari/users/andrewji/sb_multiome/s_comb_mo/render")
ggsave(atac_heat_map,file="s_comb_mo_sub_da_peaks_p_0.005_clustered.pdf",height = 10, width=10)

# Make bar plots of significantly up/down accessible peaks in clusters 3,4,2,1,0 (main predicted basal, spionous, and granular clusters)
sub_sig = subset(s_comb_mo_da_peaks_sig, cluster %in% c(1,2,3,4,10))
sub_sig_up = subset(sub_sig, avg_log2FC > 0)
sub_sig_down = subset(sub_sig,avg_log2FC<0)
peak_table_up = table(sub_sig_up$cluster)
peak_table_down = table(sub_sig_down$cluster)
peak_df = rbind(peak_table_up,peak_table_down)
rownames(peak_df) = c("Up","Down")
peak_df_melt = reshape2::melt(peak_df)
peak_df_melt$Var2 <- as.factor(peak_df_melt$Var2)
peak_df_melt$Var2 = factor(peak_df_melt$Var2, levels = c(3,4,2,1,10))
peak_df_melt$new_value <- with(peak_df_melt, ifelse(Var1 == "Up", value, -value))
#new_df_melt <- with(peak_df_melt, ifelse(Var1 == "Down", value, -value)) 
g = ggplot(peak_df_melt, aes(x=Var2, y=new_value, fill=Var1)) + 
  geom_bar(stat="identity", position="identity") + ylab("Number of Differentially Accessible Peaks") + xlab("Predicted Cluster")
print(g)

ggsave(filename = "s_comb_mo_num_da_peaks_sub_clus.pdf",g,width = 5,height = 6, useDingbats=F)


setwd("/scratch/groups/khavari/users/andrewji/sb_multiome/s_comb_mo/render")
saveRDS(s_comb_mo,file="s_comb_mo.Rds")

sessionInfo()
## R version 4.2.0 (2022-04-22)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: CentOS Linux 7 (Core)
## 
## Matrix products: default
## BLAS/LAPACK: /share/software/user/open/openblas/0.3.10/lib/libopenblas_haswellp-r0.3.10.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] pheatmap_1.0.12           future_1.30.0            
##  [3] hdf5r_1.3.7               dplyr_1.0.10             
##  [5] Matrix_1.5-3              patchwork_1.1.2          
##  [7] ggplot2_3.4.0             SeuratObject_4.1.3       
##  [9] Seurat_4.3.0              Signac_1.9.0             
## [11] EnsDb.Hsapiens.v86_2.99.0 ensembldb_2.22.0         
## [13] AnnotationFilter_1.22.0   GenomicFeatures_1.50.3   
## [15] AnnotationDbi_1.60.0      Biobase_2.58.0           
## [17] GenomicRanges_1.50.2      GenomeInfoDb_1.34.4      
## [19] IRanges_2.32.0            S4Vectors_0.36.1         
## [21] BiocGenerics_0.44.0      
## 
## loaded via a namespace (and not attached):
##   [1] utf8_1.2.2                  spatstat.explore_3.0-5     
##   [3] reticulate_1.26             tidyselect_1.2.0           
##   [5] RSQLite_2.2.20              htmlwidgets_1.6.0          
##   [7] grid_4.2.0                  BiocParallel_1.32.5        
##   [9] Rtsne_0.16                  munsell_0.5.0              
##  [11] codetools_0.2-18            ica_1.0-3                  
##  [13] interp_1.1-3                miniUI_0.1.1.1             
##  [15] withr_2.5.0                 spatstat.random_3.0-1      
##  [17] colorspace_2.0-3            progressr_0.12.0           
##  [19] filelock_1.0.2              highr_0.9                  
##  [21] knitr_1.39                  rstudioapi_0.14            
##  [23] ROCR_1.0-11                 tensor_1.5                 
##  [25] listenv_0.9.0               MatrixGenerics_1.10.0      
##  [27] labeling_0.4.2              GenomeInfoDbData_1.2.9     
##  [29] polyclip_1.10-4             farver_2.1.1               
##  [31] bit64_4.0.5                 parallelly_1.33.0          
##  [33] vctrs_0.5.1                 generics_0.1.3             
##  [35] xfun_0.32                   biovizBase_1.46.0          
##  [37] BiocFileCache_2.6.0         R6_2.5.1                   
##  [39] bitops_1.0-7                spatstat.utils_3.0-1       
##  [41] cachem_1.0.6                DelayedArray_0.24.0        
##  [43] assertthat_0.2.1            promises_1.2.0.1           
##  [45] BiocIO_1.8.0                scales_1.2.1               
##  [47] nnet_7.3-17                 gtable_0.3.1               
##  [49] globals_0.16.2              goftest_1.2-3              
##  [51] rlang_1.0.6                 RcppRoll_0.3.0             
##  [53] splines_4.2.0               rtracklayer_1.58.0         
##  [55] lazyeval_0.2.2              dichromat_2.0-0.1          
##  [57] checkmate_2.1.0             spatstat.geom_3.0-3        
##  [59] yaml_2.3.5                  reshape2_1.4.4             
##  [61] abind_1.4-5                 backports_1.4.1            
##  [63] httpuv_1.6.7                Hmisc_4.7-2                
##  [65] tools_4.2.0                 ellipsis_0.3.2             
##  [67] jquerylib_0.1.4             RColorBrewer_1.1-3         
##  [69] ggridges_0.5.4              Rcpp_1.0.9                 
##  [71] plyr_1.8.8                  base64enc_0.1-3            
##  [73] progress_1.2.2              zlibbioc_1.44.0            
##  [75] purrr_0.3.4                 RCurl_1.98-1.9             
##  [77] prettyunits_1.1.1           rpart_4.1.16               
##  [79] deldir_1.0-6                pbapply_1.6-0              
##  [81] cowplot_1.1.1               zoo_1.8-11                 
##  [83] SummarizedExperiment_1.28.0 ggrepel_0.9.2              
##  [85] cluster_2.1.3               magrittr_2.0.3             
##  [87] data.table_1.14.6           scattermore_0.8            
##  [89] lmtest_0.9-40               RANN_2.6.1                 
##  [91] ProtGenerics_1.30.0         fitdistrplus_1.1-8         
##  [93] matrixStats_0.63.0          hms_1.1.2                  
##  [95] mime_0.12                   evaluate_0.16              
##  [97] xtable_1.8-4                XML_3.99-0.13              
##  [99] jpeg_0.1-10                 gridExtra_2.3              
## [101] compiler_4.2.0              biomaRt_2.54.0             
## [103] tibble_3.1.8                KernSmooth_2.23-20         
## [105] crayon_1.5.1                htmltools_0.5.4            
## [107] later_1.3.0                 Formula_1.2-4              
## [109] tidyr_1.2.1                 DBI_1.1.3                  
## [111] dbplyr_2.2.1                MASS_7.3-56                
## [113] rappdirs_0.3.3              cli_3.5.0                  
## [115] parallel_4.2.0              igraph_1.3.5               
## [117] pkgconfig_2.0.3             GenomicAlignments_1.34.0   
## [119] foreign_0.8-82              sp_1.5-1                   
## [121] plotly_4.10.1               spatstat.sparse_3.0-0      
## [123] xml2_1.3.3                  bslib_0.4.0                
## [125] XVector_0.38.0              VariantAnnotation_1.44.0   
## [127] stringr_1.4.0               digest_0.6.29              
## [129] sctransform_0.3.5           RcppAnnoy_0.0.20           
## [131] spatstat.data_3.0-0         Biostrings_2.66.0          
## [133] rmarkdown_2.15              leiden_0.4.3               
## [135] fastmatch_1.1-3             htmlTable_2.4.1            
## [137] uwot_0.1.14                 restfulr_0.0.15            
## [139] curl_4.3.3                  shiny_1.7.4                
## [141] Rsamtools_2.14.0            rjson_0.2.21               
## [143] lifecycle_1.0.3             nlme_3.1-157               
## [145] jsonlite_1.8.4              BSgenome_1.66.1            
## [147] viridisLite_0.4.1           fansi_1.0.3                
## [149] pillar_1.8.1                lattice_0.20-45            
## [151] KEGGREST_1.38.0             fastmap_1.1.0              
## [153] httr_1.4.4                  survival_3.3-1             
## [155] glue_1.6.2                  png_0.1-8                  
## [157] bit_4.0.5                   stringi_1.7.8              
## [159] sass_0.4.2                  blob_1.2.3                 
## [161] latticeExtra_0.6-30         memoise_2.0.1              
## [163] irlba_2.3.5.1               future.apply_1.10.0